import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import cv2
import sys
 
def get_engine(engine_path):
    """Deserialize a prebuilt TensorRT engine from disk.

    Args:
        engine_path: Path to a serialized ``.engine`` / ``.trt`` file.

    Returns:
        The deserialized ``ICudaEngine`` (or ``None`` if TensorRT fails
        to deserialize the buffer).
    """
    print("Reading engine from file {}".format(engine_path))
    with open(engine_path, "rb") as engine_file:
        serialized_engine = engine_file.read()
    # Uses the module-level TRT_LOGGER (defined after this function, but
    # bound by the time any call happens).
    with trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(serialized_engine)
 
TRT_LOGGER = trt.Logger()
# engine = get_engine("yolov4_1.trt")


# Debug dump: print each binding's element count, shape, name, direction
# and numpy dtype for a probe engine.
# NOTE(review): get_binding_shape / binding_is_input / get_binding_dtype are
# the pre-TRT-8.5 binding API — fine here, but removed in TensorRT 9+.
engine = get_engine("test.engine")
print(engine)
for binding in engine:
    shape = engine.get_binding_shape(binding)
    print(trt.volume(shape))
    print(shape)
    print(binding)
    print(engine.binding_is_input(binding))
    dtype = trt.nptype(engine.get_binding_dtype(binding))
    print("dtype = ", dtype)


# Engine actually used for inference below, plus its execution context.
engine = get_engine("shuf.engine")
context = engine.create_execution_context()
 
def get_landmarks(img):
    """Run the TensorRT engine on one BGR image and return its outputs.

    Preprocessing: resize to 640x640, BGR->RGB, HWC->NCHW float32 in [0, 1].
    The engine (see bindings probed above) has one input (binding 0) and
    three outputs (bindings 1-3) — presumably one per detection stride
    (8/16/32); TODO confirm against the exported model.

    Args:
        img: BGR image as an HxWx3 uint8 array (as returned by cv2.imread).

    Returns:
        A flat float32 numpy array: the three output buffers concatenated
        in binding order. The caller reshapes it to (-1, 2).
    """
    # BUGFIX: removed a leftover debug `sys.exit()` that made everything
    # past preprocessing unreachable dead code.
    resized = cv2.resize(img, (640, 640), interpolation=cv2.INTER_LINEAR)
    img_in = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
    img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32)
    img_in = np.expand_dims(img_in, axis=0)
    img_in /= 255.0

    # Page-locked host buffers, sized from the context's binding shapes.
    h_input = cuda.pagelocked_empty(
        trt.volume(context.get_binding_shape(0)), dtype=np.float32)
    h_outputs = [
        cuda.pagelocked_empty(
            trt.volume(context.get_binding_shape(i)), dtype=np.float32)
        for i in (1, 2, 3)
    ]

    # Matching device buffers.
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_outputs = [cuda.mem_alloc(h_out.nbytes) for h_out in h_outputs]

    # Stream for async copies + inference.
    stream = cuda.Stream()

    # Stage the preprocessed image into the pinned input buffer.
    np.copyto(h_input, img_in.ravel())

    cuda.memcpy_htod_async(d_input, h_input, stream)
    # BUGFIX: the bindings list must cover every engine binding. The old
    # code passed only [d_input, d_output] where `d_output` was an
    # undefined name (NameError) and two outputs were missing.
    bindings = [int(d_input)] + [int(d_out) for d_out in d_outputs]
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # BUGFIX: copy back all three outputs (old code referenced undefined
    # `h_output`/`d_output`).
    for h_out, d_out in zip(h_outputs, d_outputs):
        cuda.memcpy_dtoh_async(h_out, d_out, stream)
    stream.synchronize()

    # Concatenate in binding order so the caller's .reshape(-1, 2) applies
    # to the combined landmark tensor.
    return np.concatenate(h_outputs)
 
 
# Driver: load the test image and run inference on it.
img1 = cv2.imread("./538.png")
# ROBUSTNESS: cv2.imread returns None (no exception) when the file is
# missing or unreadable; fail loudly instead of an opaque AttributeError.
if img1 is None:
    raise FileNotFoundError("Could not read image ./538.png")
print(img1.shape)
# Interpret the flat engine output as (x, y) landmark pairs.
output = get_landmarks(img1).reshape(-1, 2)
